For the DESeq2 analysis, the raw, prefiltered counts will be used.
se.gastroc <- readRDS("./data/Robjects/01_se.gastroc.rds")
se.soleus <- readRDS("./data/Robjects/01_se.soleus.rds")
counts.gastroc <- se.gastroc[rowData(se.gastroc)$filtered,] %>% assay()
counts.soleus <- se.soleus[rowData(se.soleus)$filtered,] %>% assay()
metadata <- colData(se.gastroc)
# trying to reorder the metadata, so that the design is correct (so far no luck)
# metadata_2 <- as.data.frame(metadata)
# metadata_2$genotype <- factor(metadata_2$genotype, levels = c("KO", "WT"))
# metadata_2 <- rbind(metadata[7:12,,drop=FALSE], metadata[1:6,,drop=FALSE])
metadata$genotype <- as.factor(metadata$genotype) # for DESeq
createDDSObject <- function(counts, metadata) {
# select sample columns
reorder_index <- match(rownames(metadata), colnames(counts))
counts <- counts[,reorder_index] %>%
apply(c(1,2), as.integer) # DESeq need normal counts (integer) as input
# Check metadata consistency
all(rownames(metadata) %in% colnames(counts)) %>%
assertthat::assert_that(., msg = "metadata and count table do not match")
## DESeq2 object
dds <- DESeqDataSetFromMatrix(countData = counts,
colData = metadata,
design = ~ genotype)
return( DESeq(dds) )
}
# creating DESeq Objects
dds.gastroc <- createDDSObject(counts.gastroc, metadata)
dds.soleus <- createDDSObject(counts.soleus, metadata)
res.gastroc <- results(dds.gastroc, alpha = params$pCutoff, contrast = c("genotype", "KO", "WT"))
res.soleus <- results(dds.soleus, alpha = params$pCutoff, contrast = c("genotype", "KO", "WT"))
# lfcShrink is not applied by default
# library(ashr) # using instead of "apeglm", because https://www.bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#extended-section-on-shrinkage-estimators
# resultsNames(dds.gastroc)
resLFC.gastroc <- lfcShrink(dds.gastroc, contrast = c("genotype", "KO", "WT"), type = "ashr")
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
## Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
## https://doi.org/10.1093/biostatistics/kxw041
resLFC.soleus <- lfcShrink(dds.soleus, contrast = c("genotype", "KO", "WT"), type = "ashr")
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
## Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
## https://doi.org/10.1093/biostatistics/kxw041
Filtering outliers using the cooks distance is done automatically and
p-values are set to NA. Changing (lowering) the
minReplicatesForReplace will replace these flagged
values.
DESeq2 flags these outliers and removes the corresponding genes completely, if below 7 samples (above would replace the values)
# assuming all NA p-values are the ones filtered out using the cooks distance
print_outliers <- function(res, se){
# all genes with a `NA` value are exactly those 5 which were also detected by the cooks distance:
outlier_ids <- res %>%
as.data.frame() %>%
filter(is.na(pvalue)) %>%
rownames()
# a different approach would be finding the genes which have a cooks distance
# above the cutoff:
# cooksCutoff <- qf(.99, 2,10)
# outlier_ids <- which(mcols(dds)$maxCooks > cooksCutoff) %>% names()
# getting counts and metadata
outlier_metadata.df <- rowData(se) %>%
as.data.frame() %>%
select(gene_name, gene_biotype)
counts.df <- assay(se)
# merging df
outliers.df <-
merge(outlier_metadata.df, counts.df[outlier_ids,], by = 0)
return(outliers.df)
}
outliers_gastroc.df <- print_outliers(res.gastroc, se.gastroc) %>% tibble::column_to_rownames(var = "Row.names")
write.csv(outliers_gastroc.df, file = "./code/analysis/03_outliers_gastroc.csv")
outliers_gastroc.df %>% knitr::kable()
print_outliers(res.soleus, se.soleus) %>% knitr::kable()
| gene_name | gene_biotype | WT_1 | WT_3 | WT_5 | WT_9 | WT_11 | WT_15 | KO_2 | KO_4 | KO_6 | KO_8 | KO_10 | KO_16 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ENSMUSG00000027375 | Mal | protein_coding | 496 | 82 | 87 | 88 | 118 | 71 | 112 | 80 | 70 | 99 | 164 | 200 |
| ENSMUSG00000039579 | Grin3a | protein_coding | 4 | 1 | 5 | 3 | 0 | 3 | 12 | 5 | 11 | 2 | 9 | 142 |
| ENSMUSG00000041607 | Mbp | protein_coding | 3390 | 462 | 524 | 654 | 836 | 536 | 849 | 493 | 456 | 606 | 1305 | 1094 |
| ENSMUSG00000053198 | Prx | protein_coding | 1190 | 159 | 236 | 327 | 346 | 200 | 313 | 196 | 197 | 237 | 428 | 372 |
| ENSMUSG00000056569 | Mpz | protein_coding | 7949 | 1168 | 1229 | 1562 | 1657 | 996 | 1792 | 1181 | 877 | 1447 | 2681 | 2691 |
| Row.names | gene_name | gene_biotype | WT_1 | WT_3 | WT_5 | WT_9 | WT_11 | WT_15 | KO_2 | KO_4 | KO_6 | KO_8 | KO_10 | KO_16 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ENSMUSG00000000290 | Itgb2 | protein_coding | 22 | 47 | 45 | 50 | 72 | 18 | 38 | 457 | 44 | 58 | 66 | 63 |
| ENSMUSG00000000682 | Cd52 | protein_coding | 8 | 34 | 13 | 18 | 19 | 24 | 10 | 204 | 23 | 15 | 21 | 22 |
| ENSMUSG00000000982 | Ccl3 | protein_coding | 0 | 0 | 0 | 3 | 1 | 0 | 1 | 25 | 0 | 0 | 3 | 0 |
| ENSMUSG00000001131 | Timp1 | protein_coding | 21 | 39 | 25 | 32 | 30 | 38 | 94 | 1080 | 85 | 33 | 81 | 68 |
| ENSMUSG00000002985 | Apoe | protein_coding | 762 | 1049 | 1260 | 1220 | 1418 | 1005 | 1417 | 10248 | 1592 | 1330 | 1474 | 1369 |
| ENSMUSG00000003484 | Cyp4f18 | protein_coding | 2 | 7 | 3 | 8 | 9 | 3 | 17 | 166 | 10 | 15 | 12 | 4 |
| ENSMUSG00000003882 | Il7r | protein_coding | 3 | 3 | 1 | 7 | 2 | 0 | 2 | 129 | 3 | 5 | 6 | 6 |
| ENSMUSG00000004707 | Ly9 | protein_coding | 0 | 0 | 4 | 5 | 8 | 0 | 4 | 122 | 5 | 2 | 0 | 1 |
| ENSMUSG00000005087 | Cd44 | protein_coding | 183 | 229 | 200 | 236 | 240 | 190 | 324 | 1750 | 335 | 262 | 414 | 265 |
| ENSMUSG00000006519 | Cyba | protein_coding | 135 | 138 | 144 | 132 | 124 | 165 | 177 | 732 | 128 | 166 | 214 | 129 |
| ENSMUSG00000015947 | Fcgr1 | protein_coding | 6 | 5 | 12 | 6 | 10 | 3 | 7 | 143 | 15 | 4 | 10 | 6 |
| ENSMUSG00000015950 | Ncf1 | protein_coding | 22 | 51 | 59 | 45 | 71 | 18 | 67 | 497 | 76 | 36 | 75 | 55 |
| ENSMUSG00000017716 | Birc5 | protein_coding | 7 | 21 | 7 | 8 | 15 | 16 | 16 | 227 | 18 | 19 | 18 | 21 |
| ENSMUSG00000018927 | Ccl6 | protein_coding | 83 | 164 | 172 | 135 | 175 | 95 | 206 | 1248 | 182 | 114 | 204 | 109 |
| ENSMUSG00000019876 | Pkib | protein_coding | 2 | 2 | 6 | 2 | 4 | 4 | 4 | 92 | 3 | 3 | 1 | 4 |
| ENSMUSG00000019987 | Arg1 | protein_coding | 5 | 6 | 2 | 13 | 6 | 1 | 8 | 108 | 6 | 17 | 10 | 9 |
| ENSMUSG00000020120 | Plek | protein_coding | 17 | 39 | 30 | 32 | 48 | 22 | 36 | 407 | 44 | 32 | 21 | 29 |
| ENSMUSG00000020865 | Abcc3 | protein_coding | 14 | 13 | 22 | 16 | 19 | 11 | 17 | 184 | 23 | 19 | 23 | 13 |
| ENSMUSG00000021091 | Serpina3n | protein_coding | 19 | 22 | 29 | 22 | 22 | 22 | 65 | 404 | 42 | 31 | 49 | 57 |
| ENSMUSG00000021175 | Cdca7l | protein_coding | 0 | 3 | 2 | 2 | 0 | 0 | 0 | 34 | 2 | 2 | 0 | 2 |
| ENSMUSG00000021886 | Gpr65 | protein_coding | 0 | 2 | 5 | 4 | 5 | 5 | 5 | 56 | 5 | 4 | 6 | 3 |
| ENSMUSG00000021998 | Lcp1 | protein_coding | 95 | 87 | 97 | 126 | 124 | 73 | 134 | 1457 | 185 | 129 | 187 | 73 |
| ENSMUSG00000022534 | Mefv | protein_coding | 1 | 9 | 1 | 1 | 2 | 0 | 0 | 33 | 2 | 1 | 0 | 2 |
| ENSMUSG00000024300 | Myo1f | protein_coding | 19 | 34 | 19 | 37 | 26 | 22 | 25 | 349 | 42 | 32 | 23 | 25 |
| ENSMUSG00000024397 | Aif1 | protein_coding | 9 | 6 | 7 | 4 | 9 | 10 | 5 | 128 | 8 | 9 | 15 | 10 |
| ENSMUSG00000024529 | Lox | protein_coding | 74 | 190 | 110 | 93 | 154 | 108 | 201 | 820 | 161 | 129 | 151 | 152 |
| ENSMUSG00000024675 | Ms4a4c | protein_coding | 0 | 0 | 5 | 0 | 2 | 0 | 3 | 26 | 0 | 0 | 3 | 0 |
| ENSMUSG00000024677 | Ms4a6b | protein_coding | 13 | 32 | 27 | 24 | 42 | 16 | 21 | 179 | 23 | 18 | 23 | 19 |
| ENSMUSG00000024679 | Ms4a6d | protein_coding | 9 | 17 | 12 | 15 | 21 | 13 | 12 | 305 | 22 | 20 | 15 | 6 |
| ENSMUSG00000025044 | Msr1 | protein_coding | 10 | 7 | 12 | 15 | 22 | 2 | 2 | 358 | 23 | 14 | 15 | 17 |
| ENSMUSG00000025473 | Adam8 | protein_coding | 30 | 57 | 44 | 53 | 39 | 38 | 49 | 525 | 45 | 38 | 52 | 72 |
| ENSMUSG00000025877 | Hk3 | protein_coding | 9 | 10 | 7 | 3 | 9 | 2 | 11 | 165 | 9 | 16 | 10 | 10 |
| ENSMUSG00000026358 | Rgs1 | protein_coding | 3 | 4 | 0 | 0 | 1 | 0 | 0 | 88 | 0 | 2 | 0 | 1 |
| ENSMUSG00000029304 | Spp1 | protein_coding | 77 | 47 | 68 | 34 | 78 | 113 | 97 | 4879 | 96 | 61 | 63 | 62 |
| ENSMUSG00000029373 | Pf4 | protein_coding | 56 | 69 | 105 | 115 | 111 | 45 | 57 | 846 | 99 | 83 | 75 | 50 |
| ENSMUSG00000029915 | Clec5a | protein_coding | 8 | 10 | 1 | 7 | 4 | 8 | 7 | 86 | 11 | 6 | 10 | 6 |
| ENSMUSG00000030117 | Gdf3 | protein_coding | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 129 | 10 | 1 | 3 | 1 |
| ENSMUSG00000030144 | Clec4d | protein_coding | 0 | 0 | 0 | 2 | 1 | 4 | 3 | 110 | 4 | 0 | 5 | 3 |
| ENSMUSG00000030365 | Clec2i | protein_coding | 3 | 3 | 2 | 8 | 2 | 5 | 0 | 44 | 2 | 3 | 4 | 0 |
| ENSMUSG00000030579 | Tyrobp | protein_coding | 19 | 25 | 45 | 39 | 25 | 18 | 24 | 508 | 34 | 13 | 26 | 20 |
| ENSMUSG00000030786 | Itgam | protein_coding | 9 | 36 | 16 | 20 | 39 | 6 | 18 | 450 | 36 | 14 | 36 | 23 |
| ENSMUSG00000030789 | Itgax | protein_coding | 24 | 20 | 8 | 9 | 9 | 10 | 17 | 141 | 20 | 19 | 29 | 24 |
| ENSMUSG00000031443 | F7 | protein_coding | 1 | 9 | 4 | 3 | 1 | 2 | 16 | 180 | 4 | 3 | 13 | 7 |
| ENSMUSG00000031827 | Cotl1 | protein_coding | 76 | 128 | 118 | 132 | 133 | 101 | 134 | 1350 | 156 | 175 | 174 | 125 |
| ENSMUSG00000032122 | Slc37a2 | protein_coding | 26 | 44 | 36 | 27 | 20 | 29 | 25 | 244 | 35 | 52 | 36 | 22 |
| ENSMUSG00000032218 | Ccnb2 | protein_coding | 14 | 22 | 24 | 4 | 13 | 17 | 27 | 129 | 15 | 26 | 20 | 15 |
| ENSMUSG00000032322 | Pstpip1 | protein_coding | 8 | 29 | 19 | 15 | 26 | 28 | 30 | 186 | 30 | 36 | 34 | 29 |
| ENSMUSG00000032511 | Scn5a | protein_coding | 47 | 49 | 51 | 86 | 108 | 55 | 74 | 542 | 87 | 73 | 92 | 87 |
| ENSMUSG00000033220 | Rac2 | protein_coding | 17 | 16 | 23 | 27 | 13 | 15 | 23 | 186 | 14 | 20 | 33 | 26 |
| ENSMUSG00000033777 | Tlr13 | protein_coding | 3 | 6 | 2 | 10 | 11 | 9 | 17 | 217 | 23 | 12 | 8 | 5 |
| ENSMUSG00000034947 | Tmem106a | protein_coding | 45 | 41 | 84 | 76 | 80 | 53 | 64 | 391 | 63 | 86 | 97 | 50 |
| ENSMUSG00000035202 | Lars2 | protein_coding | 1830 | 4441 | 2177 | 1921 | 1785 | 1989 | 7700 | 1688 | 1395 | 1567 | 1838 | 1302 |
| ENSMUSG00000035373 | Ccl7 | protein_coding | 3 | 3 | 3 | 14 | 3 | 0 | 5 | 398 | 16 | 17 | 10 | 6 |
| ENSMUSG00000035385 | Ccl2 | protein_coding | 10 | 12 | 11 | 43 | 15 | 1 | 16 | 348 | 14 | 12 | 41 | 5 |
| ENSMUSG00000036887 | C1qa | protein_coding | 115 | 166 | 220 | 242 | 260 | 142 | 199 | 1303 | 256 | 195 | 226 | 180 |
| ENSMUSG00000037280 | Galnt6 | protein_coding | 4 | 4 | 4 | 2 | 3 | 3 | 4 | 108 | 7 | 1 | 6 | 3 |
| ENSMUSG00000037649 | H2-DMa | protein_coding | 28 | 39 | 31 | 55 | 20 | 24 | 59 | 227 | 37 | 36 | 47 | 49 |
| ENSMUSG00000038147 | Cd84 | protein_coding | 8 | 16 | 13 | 8 | 16 | 10 | 4 | 408 | 15 | 9 | 21 | 32 |
| ENSMUSG00000038642 | Ctss | protein_coding | 74 | 111 | 107 | 110 | 99 | 50 | 110 | 3593 | 202 | 87 | 143 | 94 |
| ENSMUSG00000040204 | Pclaf | protein_coding | 8 | 8 | 8 | 7 | 5 | 5 | 3 | 197 | 15 | 13 | 14 | 6 |
| ENSMUSG00000040552 | C3ar1 | protein_coding | 14 | 13 | 32 | 36 | 51 | 3 | 48 | 689 | 41 | 20 | 49 | 22 |
| ENSMUSG00000040747 | Cd53 | protein_coding | 21 | 24 | 32 | 51 | 38 | 15 | 39 | 373 | 52 | 30 | 38 | 38 |
| ENSMUSG00000040809 | Chil3 | protein_coding | 1 | 0 | 0 | 0 | 6 | 0 | 0 | 66 | 0 | 6 | 4 | 1 |
| ENSMUSG00000040907 | Atp1a3 | protein_coding | 5 | 7 | 3 | 4 | 5 | 6 | 3 | 148 | 8 | 3 | 13 | 12 |
| ENSMUSG00000041431 | Ccnb1 | protein_coding | 4 | 14 | 2 | 1 | 5 | 0 | 5 | 118 | 5 | 1 | 13 | 8 |
| ENSMUSG00000043091 | Tuba1c | protein_coding | 113 | 105 | 81 | 135 | 172 | 142 | 126 | 606 | 125 | 108 | 119 | 107 |
| ENSMUSG00000043157 | Arl11 | protein_coding | 4 | 4 | 5 | 3 | 11 | 2 | 8 | 114 | 8 | 1 | 3 | 4 |
| ENSMUSG00000044811 | Cd300c2 | protein_coding | 4 | 7 | 4 | 8 | 3 | 3 | 9 | 178 | 12 | 14 | 12 | 11 |
| ENSMUSG00000044827 | Tlr1 | protein_coding | 3 | 1 | 3 | 0 | 1 | 0 | 2 | 78 | 6 | 3 | 9 | 2 |
| ENSMUSG00000046805 | Mpeg1 | protein_coding | 29 | 58 | 47 | 47 | 57 | 35 | 59 | 2548 | 237 | 45 | 87 | 67 |
| ENSMUSG00000047798 | Cd300lf | protein_coding | 0 | 3 | 1 | 1 | 2 | 1 | 6 | 126 | 1 | 0 | 2 | 3 |
| ENSMUSG00000048779 | P2ry6 | protein_coding | 27 | 15 | 31 | 45 | 52 | 29 | 47 | 274 | 38 | 35 | 55 | 34 |
| ENSMUSG00000052142 | Rasal3 | protein_coding | 10 | 5 | 3 | 7 | 6 | 4 | 3 | 54 | 7 | 4 | 4 | 1 |
| ENSMUSG00000052336 | Cx3cr1 | protein_coding | 12 | 18 | 11 | 10 | 24 | 7 | 27 | 366 | 23 | 8 | 9 | 12 |
| ENSMUSG00000055850 | Rnf181 | protein_coding | 41 | 20 | 42 | 23 | 45 | 355 | 48 | 38 | 37 | 35 | 341 | 29 |
| ENSMUSG00000056069 | Fam105a | protein_coding | 29 | 28 | 28 | 22 | 29 | 26 | 30 | 270 | 37 | 30 | 25 | 31 |
| ENSMUSG00000063415 | Cyp26b1 | protein_coding | 248 | 1257 | 198 | 94 | 293 | 99 | 4027 | 59 | 111 | 54 | 37 | 191 |
| ENSMUSG00000068606 | Gm4841 | protein_coding | 3 | 0 | 161 | 1 | 1 | 2 | 0 | 45 | 8 | 265 | 37 | 14 |
| ENSMUSG00000069516 | Lyz2 | protein_coding | 408 | 806 | 793 | 912 | 972 | 506 | 672 | 12094 | 1278 | 687 | 879 | 561 |
| ENSMUSG00000070780 | Rbm47 | protein_coding | 3 | 7 | 5 | 2 | 6 | 1 | 4 | 89 | 7 | 5 | 10 | 5 |
| ENSMUSG00000073412 | Lst1 | protein_coding | 6 | 2 | 7 | 5 | 5 | 3 | 4 | 57 | 2 | 4 | 4 | 6 |
| ENSMUSG00000076258 | Gm23935 | miRNA | 2112 | 8702 | 2434 | 2297 | 1344 | 2648 | 16331 | 1687 | 1437 | 1455 | 2057 | 1453 |
| ENSMUSG00000076281 | Gm24270 | miRNA | 22 | 63 | 36 | 28 | 17 | 30 | 152 | 18 | 12 | 14 | 30 | 20 |
| ENSMUSG00000078122 | F630028O10Rik | antisense | 0 | 15 | 11 | 12 | 13 | 8 | 5 | 97 | 8 | 15 | 9 | 9 |
| ENSMUSG00000078880 | Gm14308 | protein_coding | 4 | 1 | 106 | 1 | 0 | 0 | 2 | 4 | 4 | 2 | 3 | 0 |
| ENSMUSG00000078899 | Gm4631 | protein_coding | 0 | 0 | 0 | 3 | 0 | 3 | 81 | 0 | 1 | 1 | 1 | 1 |
| ENSMUSG00000079227 | Ccr5 | protein_coding | 13 | 5 | 2 | 21 | 21 | 5 | 15 | 262 | 16 | 16 | 14 | 5 |
| ENSMUSG00000079419 | Ms4a6c | protein_coding | 3 | 12 | 11 | 8 | 11 | 7 | 9 | 85 | 9 | 6 | 5 | 5 |
| ENSMUSG00000095380 | Gm23952 | miRNA | 3 | 55 | 7 | 5 | 5 | 7 | 10 | 5 | 6 | 7 | 7 | 8 |
| ENSMUSG00000103349 | Gm36888 | lincRNA | 2 | 1 | 14 | 0 | 1 | 3 | 4 | 0 | 0 | 0 | 24 | 0 |
| ENSMUSG00000115230 | AC101945.2 | lincRNA | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 37 | 0 | 1 | 1 | 3 |
sizeFactors(dds.gastroc)
sizeFactors(dds.soleus)
## WT_1 WT_3 WT_5 WT_9 WT_11 WT_15 KO_2 KO_4 KO_6 KO_8 KO_10
## 1.0098829 0.6517416 0.7363933 1.1208488 0.9796707 0.7135590 1.2740994 1.0606543 1.0397324 1.2073234 1.0876389
## KO_16
## 1.6003663
## WT_1 WT_3 WT_5 WT_9 WT_11 WT_15 KO_2 KO_4 KO_6 KO_8 KO_10
## 0.8782302 0.9413594 0.9933758 0.9259800 1.0048036 0.8889040 1.1802624 1.1871833 0.9450776 1.1361948 1.1343119
## KO_16
## 0.9180831
plotDispEsts(dds.gastroc)
plotDispEsts(dds.soleus)
plotMA(res.gastroc, ylim = c(-6,6))
plotMA(res.soleus, ylim = c(-6,6))
topGenes.gastroc <- as.data.frame(res.gastroc) %>%
tibble::rownames_to_column("GeneID") %>%
top_n(100, wt = -padj) %>%
arrange(padj)
topGenes.soleus <- as.data.frame(res.soleus) %>%
tibble::rownames_to_column("GeneID") %>%
top_n(100, wt = -padj) %>%
arrange(padj)
knitr::kable(head(topGenes.gastroc), caption = "gastroc")
knitr::kable(head(topGenes.soleus), caption = "soleus")
| GeneID | baseMean | log2FoldChange | lfcSE | stat | pvalue | padj |
|---|---|---|---|---|---|---|
| ENSMUSG00000050335 | 4934.2949 | 6.267396 | 0.1664065 | 37.66316 | 0 | 0 |
| ENSMUSG00000081249 | 1378.5460 | 4.208222 | 0.1115955 | 37.70959 | 0 | 0 |
| ENSMUSG00000010751 | 1864.4937 | 5.610357 | 0.2014267 | 27.85309 | 0 | 0 |
| ENSMUSG00000032712 | 5136.5059 | 4.686418 | 0.1683204 | 27.84225 | 0 | 0 |
| ENSMUSG00000034855 | 554.3475 | 7.276611 | 0.2735037 | 26.60517 | 0 | 0 |
| ENSMUSG00000037613 | 960.7558 | 3.183338 | 0.1266510 | 25.13472 | 0 | 0 |
| GeneID | baseMean | log2FoldChange | lfcSE | stat | pvalue | padj |
|---|---|---|---|---|---|---|
| ENSMUSG00000038615 | 32047.185 | -2.5454285 | 0.0636829 | -39.97038 | 0 | 0 |
| ENSMUSG00000081249 | 1651.592 | 4.5759032 | 0.1292779 | 35.39587 | 0 | 0 |
| ENSMUSG00000039067 | 4355.557 | -1.1049116 | 0.0419786 | -26.32084 | 0 | 0 |
| ENSMUSG00000028932 | 3190.657 | -1.1102258 | 0.0558536 | -19.87741 | 0 | 0 |
| ENSMUSG00000006998 | 7402.617 | -0.9196943 | 0.0468631 | -19.62511 | 0 | 0 |
| ENSMUSG00000102869 | 6985.622 | -0.9181929 | 0.0479924 | -19.13203 | 0 | 0 |
# TODO: use ggplot for unified look of plots
hist(res.gastroc$pvalue, main = "gastroc")
hist(res.soleus$pvalue, main = "soleus")
volcanoPlot <- function(result, se, pCutoff = 0.01, FCutoff = 1, tissue = character()) {
gene_names <-
rowData(se)[rownames(result), c("gene_name"), drop = F]
results.df <- result %>%
as.data.frame() %>%
dplyr::arrange(padj)
# top 10 gene labels for respectively up and down regulation
labs.up <- results.df[results.df$log2FoldChange > FCutoff, ] %>%
rownames() %>% .[1:10] %>% gene_names[., c("gene_name")]
labs.down <- results.df[results.df$log2FoldChange < -FCutoff, ] %>%
rownames() %>% .[1:10] %>% gene_names[., c("gene_name")]
selectLab <- c(labs.up, labs.down, "Nfe2l1") %>% unique() # always including "Nfe2l1"
# custom colors:
keyvals <- ifelse(
result$log2FoldChange > FCutoff &
result$padj < pCutoff,
params$regulated_pal$upregulated,
ifelse(
result$log2FoldChange < -FCutoff &
result$padj < pCutoff,
params$regulated_pal$downregulated,
params$regulated_pal$insignificant
)
)
keyvals[is.na(keyvals)] <- params$regulated_pal$insignificant
names(keyvals)[keyvals == params$regulated_pal$upregulated] <- 'up regulated'
names(keyvals)[keyvals == params$regulated_pal$insignificant] <- 'nonsignificant'
names(keyvals)[keyvals == params$regulated_pal$downregulated] <- 'down regulated'
vlc.plt <- EnhancedVolcano(
result,
x = 'log2FoldChange',
y = 'padj',
title = 'KO vs WT: Nfe2l1 knockout',
subtitle = ifelse(isEmpty(tissue), "", paste0('tissue: ', tissue)),
caption = "",
ylab = expression(paste(-Log[10], p[adj])),
pCutoff = pCutoff,
FCcutoff = FCutoff,
legendPosition = 'right',
pointSize = 2,
colCustom = keyvals,
lab = gene_names$gene_name,
selectLab = selectLab,
labSize = 3,
boxedLabels = TRUE,
drawConnectors = TRUE,
max.overlaps = Inf
)
return(vlc.plt)
}
p <- volcanoPlot(res.gastroc, se.gastroc, pCutoff = params$pCutoff, FCutoff = params$FCutoff, tissue = "gastrocnemius")
ggsave(filename = "plots/02_volcano_gastroc.svg", p)
## Saving 9 x 9 in image
ggsave(filename = "plots/02_volcano_gastroc.png", p)
## Saving 9 x 9 in image
p
p <- volcanoPlot(res.soleus, se.soleus, pCutoff = params$pCutoff, FCutoff = params$FCutoff, tissue = "soleus")
ggsave(filename = "plots/02_volcano_soleus.svg", p)
## Saving 9 x 9 in image
ggsave(filename = "plots/02_volcano_soleus.png", p)
## Saving 9 x 9 in image
p
p <- volcanoPlot(resLFC.gastroc, se.gastroc, pCutoff = params$pCutoff, FCutoff = params$FCutoff, tissue = "gastrocnemius")
## Warning: One or more p-values is 0. Converting to 10^-1 * current lowest non-zero p-value...
ggsave(filename = "plots/02_volcano_gastroc_LFC.svg", p)
## Saving 9 x 9 in image
ggsave(filename = "plots/02_volcano_gastroc_LFC.png", p)
## Saving 9 x 9 in image
p
p <- volcanoPlot(resLFC.soleus, se.soleus, pCutoff = params$pCutoff, FCutoff = params$FCutoff, tissue = "soleus")
## Warning: One or more p-values is 0. Converting to 10^-1 * current lowest non-zero p-value...
ggsave(filename = "plots/02_volcano_soleus_LFC.svg", p)
## Saving 9 x 9 in image
ggsave(filename = "plots/02_volcano_soleus_LFC.png", p)
## Saving 9 x 9 in image
p
using the Wald-test stat from the DESeq2 result and
filtering on the set FCutoff=1 and
pCutoff=0.01 yields the following plot:
apply_cutoffs <- function(deseq.result, colname="stat", FCutoff, pCutoff) {
res.filtered <- deseq.result %>%
data.frame() %>%
filter(padj < pCutoff,
log2FoldChange > FCutoff | log2FoldChange < -FCutoff) %>%
dplyr::rename(!!colname := stat) %>%
dplyr::select(!!colname)
return(res.filtered)
}
gastroc_res.filtered <- apply_cutoffs(res.gastroc, colname="gastroc", params$FCutoff, params$pCutoff)
soleus_res.filtered <- apply_cutoffs(res.soleus, colname="soleus", params$FCutoff, params$pCutoff)
gene_names <- rowData(se.gastroc) %>% as.data.frame() %>%
dplyr::select(gene_name)
# combining Wald-Test data from both tissues and ordering in quadrants
res.combined <- merge(gastroc_res.filtered,
soleus_res.filtered,
by = 0) %>%
mutate(diff.exp = case_when(
gastroc < 0 & soleus < 0 ~ "both down",
gastroc > 0 & soleus > 0 ~ "both up",
gastroc < 0 & soleus > 0 ~ "gastrocnemius down,\n soleus up",
gastroc > 0 & soleus < 0 ~ "gastrocnemus up,\n soleus down",
TRUE ~ "different"
)) %>%
merge(gene_names, by.x="Row.names", by.y=0)
# removing all gene_names except the top_n_genes (sum of absolute Wald-test numbers)
top_n_genes <- 10
top_labels <- res.combined %>%
group_by(diff.exp) %>%
arrange(desc(abs(gastroc) + abs(soleus))) %>%
filter(row_number() %in% c(1:top_n_genes)) %>%
ungroup() %>%
.$gene_name
res.combined <- res.combined %>%
mutate(gene_name = ifelse(gene_name %in% top_labels, gene_name, ""))
# final plot
p <- ggplot(res.combined, aes(x = gastroc, y = soleus, label = gene_name)) +
geom_vline(xintercept = 0) +
geom_hline(yintercept = 0) +
geom_point(aes(color = diff.exp)) +
# scale_color_manual(values = c("red", "chartreuse1", "bisque", "royalblue")) +
labs(x = "gastrocnemius", y = "soleus", color = "significantly\ndifferentially\nexpressed") +
ggrepel::geom_label_repel(max.overlaps = 15) +
ggtitle(label = "DEA: t-statistics (Wald test)") +
theme_bw() +
theme(
axis.title = element_text(size = 13),
axis.text = element_text(size = 13),
title = element_text(size = 13),
legend.text = element_text(size = 10)
)
ggsave(filename = "plots/02_dea_scatter.svg", p)
## Saving 9 x 9 in image
ggsave(filename = "plots/02_dea_scatter.png", p)
## Saving 9 x 9 in image
p
p <- ggplot(res.combined, aes(x = diff.exp)) +
geom_bar(aes(fill = diff.exp)) +
theme_bw()
ggsave(filename = "plots/02_dea_scatter_barplot.svg", p)
## Saving 7 x 5 in image
ggsave(filename = "plots/02_dea_scatter_barplot.png", p)
## Saving 7 x 5 in image
p
boxplot.genes <- function(counts.df, title ="", FCutoff = 1) {
ggplot(counts.df,
aes(
# x = as.factor(gene_name),
x = reorder(gene_name,ensembl),
y = normalized_counts,
fill = genotype
)) +
geom_dotplot(
binaxis = 'y',
stackdir = 'center',
dotsize = 0.3,
position = position_dodge(0.8),
fill = "black"
) +
geom_boxplot(outlier.size = 0.3) +
scale_y_log10() +
xlab("Genes") +
ylab("Normalized Counts") +
scale_fill_manual(values = condition_pal) +
ggtitle(title) +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5)) +
theme(axis.text.x = element_text(
angle = 45,
hjust = 1,
face = setBold(counts.df$gene_name, c("Nfe2l1"))
))
}
prepare_counts <- function(gene_ids, dds, se, reorder_genes = F) {
counts.top <- counts(dds, normalized = T)[gene_ids, ]
metadata <- colData(se) %>% as.data.frame()
gene_names <-
rowData(se)[, c("gene_name"), drop = F] %>% as.data.frame()
# the prepared counts df for the plot
counts.plt <-
data.frame(counts.top) %>%
tibble::rownames_to_column(var = "ensembl") %>%
tidyr::gather(key = "samplename",
value = "normalized_counts", 2:13) %>%
merge(metadata, by.x = "samplename", by.y = 0) %>%
merge(gene_names, by.x = "ensembl", by.y = 0) %>%
mutate(genotype = factor(genotype)) %>%
mutate(genotype = relevel(genotype, "WT")) # "WT" needs to be displayed before "KO"
if (reorder_genes) {
counts.plt <- counts.plt %>%
mutate(
gene_name = forcats::fct_reorder(gene_name, normalized_counts, .desc = T),
ensembl = forcats::fct_reorder(ensembl, normalized_counts, .desc = T)
)
} else {
idx = match(counts.plt$ensembl, gene_ids)
counts.plt <- counts.plt[order(idx),] %>%
mutate(gene_name = factor(gene_name))# %>%
# arrange(gene_name)
counts.plt$ensembl <- factor(counts.plt$ensembl, levels = gene_ids,ordered = TRUE)
counts.plt$gene_name <- factor(counts.plt$gene_name)
# counts.plt$gene_name <- factor(counts.plt$gene_name, levels = gene_ids)
# counts.plt <- counts.plt %>%
# mutate(gene_name = factor(gene_name, levels = gene_ids)) %>%
# arrange()
}
return(counts.plt)
}
# helper function to set the font of a label on the axis to bold
setBold <- function(src, special_labs) {
# source: https://stackoverflow.com/questions/39694490/highlighting-individual-axis-labels-in-bold-using-ggplot2
if (!is.factor(src))
src <- factor(src)
src_levels <- base::levels(src)
brave <- special_labs %in% src_levels
b_vec <- rep("plain", length(src_levels))
if (all(brave)) {
b_pos <- purrr::map_int(special_labs, ~ which(. == src_levels))
b_vec[b_pos] <- "bold"
b_vec
} else {
message("setBold: no matching element found")
}
return(b_vec)
}
# actual plot call
for (group in unique(res.combined$diff.exp)) {
gene_ids <- filter(res.combined, diff.exp == group)$`Row.names`
prep_counts.gastroc <- prepare_counts(gene_ids, dds.gastroc, se.gastroc, reorder_genes = T)
prep_counts.soleus <-
prepare_counts(levels(prep_counts.gastroc$ensembl), dds.soleus, se.soleus, reorder_genes = T)
gene_ids <- levels(prep_counts.gastroc$ensembl)
p.g <- boxplot.genes(
counts.df = prep_counts.gastroc,
title = paste0("Sign. regulated genes: ", group, "\n tissue: ", "gastrocnemius")
) #%>% print()
p.s <- boxplot.genes(
counts.df = prep_counts.soleus,
title = paste0("\n tissue: ", "soleus")
) #%>% print()
ggpubr::ggarrange(p.g, p.s, ncol = 1, nrow = 2)
}
setBold <- function(src, special_labs) {
# source: https://stackoverflow.com/questions/39694490/highlighting-individual-axis-labels-in-bold-using-ggplot2
if (!is.factor(src))
src <- factor(src)
src_levels <- base::levels(src)
brave <- special_labs %in% src_levels
b_vec <- rep("plain", length(src_levels))
if (all(brave)) {
b_pos <- purrr::map_int(special_labs, ~ which(. == src_levels))
b_vec[b_pos] <- "bold"
b_vec
} else {
message("setBold: no matching element found")
}
return(b_vec)
}
getTopExpressedEnsemblNames <- function(result,
FCutoff,
n,
up = T) {
.filter <- ifelse(up, `>`, `<`)
subset(result, .filter(log2FoldChange, FCutoff)) %>%
data.frame() %>%
filter(baseMean > 100) %>%
arrange(padj) %>%
.[1:n, ] %>%
rownames()
}
boxplot.top <- function(result, dds, se, upregulated=TRUE, n = 20, FCutoff = 1, tissue ="") {
# getting the top n regulated genes
names.top <- getTopExpressedEnsemblNames(result, FCutoff=FCutoff, n=n, up = upregulated)
counts.top <- counts(dds, normalized=T)[names.top, ]
metadata <- colData(se) %>% as.data.frame()
gene_names <- rowData(se)[, c("gene_name"), drop = F] %>% as.data.frame()
counts.plt <-
data.frame(counts.top) %>%
tibble::rownames_to_column(var = "ensembl") %>%
tidyr::gather(key = "samplename",
value = "normalized_counts", 2:13) %>%
merge(metadata, by.x="samplename", by.y=0) %>%
merge(gene_names, by.x="ensembl", by.y=0) %>%
mutate(genotype = factor(genotype)) %>%
mutate(genotype = relevel(genotype, "WT")) %>% # "WT" needs to be displayed before "KO"
mutate(gene_name = forcats::fct_reorder(gene_name, normalized_counts, .desc = T))
direction <- ifelse(upregulated, "up", "down")
ggplot(counts.plt,
aes(
x = as.factor(gene_name),
y = normalized_counts,
fill = genotype
)) +
geom_dotplot(
binaxis = 'y',
stackdir = 'center',
dotsize = 0.3,
position = position_dodge(0.8),
fill = "black"
) +
geom_boxplot(outlier.size = 0.3) +
scale_y_log10() +
xlab("Genes") +
ylab("Normalized Counts") +
scale_fill_manual(values = condition_pal) +
ggtitle(paste0("Top ", n, " ", direction, "-regulated Genes\n tissue: ", tissue)) +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5)) +
theme(axis.text.x = element_text(
angle = 45,
hjust = 1,
face = setBold(counts.plt$gene_name, c("Nfe2l1"))
))
}
boxplot.top(res.gastroc, dds.gastroc, se.gastroc, upregulated = T, tissue = "gastrocnemius")
boxplot.top(res.gastroc, dds.gastroc, se.gastroc, upregulated = F, tissue = "gastrocnemius")
boxplot.top(res.soleus, dds.soleus, se.soleus, upregulated = T, tissue = "soleus")
boxplot.top(res.soleus, dds.soleus, se.soleus, upregulated = F, tissue = "soleus")
Here are some Venn or Euler (proportional Venn) diagrams to choose from.
Note: using Eulerr diagrams, yields inconsistent plots where the circle sets will be placed at different absolute positions each time the plot function is called. => Thus the plot might need to be called multiple times until the wanted constellation is obtained.
significant overview with all genes:
venn.colors <- c(genes = "white", params$tissue_pal)
gene_sets <- c(
"all genes" = sign_gene_stats$`all genes` - sign_gene_stats$shared_sig_genes,
"all genes&gastroc" = sign_gene_stats$gastroc - sign_gene_stats$shared_sig_genes,
"all genes&soleus" = sign_gene_stats$soleus - sign_gene_stats$shared_sig_genes,
"all genes&gastroc&soleus" = sign_gene_stats$shared_sig_genes
)
# 1st option: euler
plot(euler(gene_sets),
legend = list(side = "right") ,
main = "significant genes per tissue",
fills = venn.colors)
# 2nd option: venn
plot(venn(gene_sets),
main = "significant genes per tissue",
fills = venn.colors)
significant grouped:
col_palette <- RColorBrewer::brewer.pal(8, "Dark2")
# venn.colors <- c(gastroc = "#FFB08E", soleus = "#FF6638", col_palette[1:4])
venn.colors <- c(params$tissue_pal, scales::hue_pal()(4))
# sets for the venn/euler diagram (exclusive counts)
gene_sets <- c(
"gastrocnemius" = sign_gene_stats$gastroc - sign_gene_stats$shared_sig_genes,
"soleus" = sign_gene_stats$soleus - sign_gene_stats$shared_sig_genes,
"gastroc&soleus" = sign_gene_stats$shared_sig_genes,
"gastroc&soleus&both down" = sign_gene_stats$`both down`,
"gastroc&soleus&both up" = sign_gene_stats$`both up`,
"gastroc&soleus&ga up, sol down" = sign_gene_stats$`ga up, sol down`,
"gastroc&soleus&ga down, sol up" = sign_gene_stats$`ga down, sol up`
)
# 'gastroc&soleus': if this line is omitted, then all intersects will be denoted
# with 0 (which was the goal to omit)
# 1st option: eulerr
plot(
euler(gene_sets),
quantities = T,
legend = list(side = "right"),
fills = venn.colors,
main = "significant genes"
)
## Warning in colSums(id & !empty) == 0 | merged_sets: longer object length is not a multiple of shorter object length
# only the two tissues
gene_sets <- c(
"gastrocnemius" = sign_gene_stats$gastroc - sign_gene_stats$shared_sig_genes,
"soleus" = sign_gene_stats$soleus - sign_gene_stats$shared_sig_genes,
"gastroc&soleus" = sign_gene_stats$shared_sig_genes
)
# 2nd option: euler two tissues
p <- plot(
euler(gene_sets),
quantities = T,
legend = list(side = "right"),
fills = venn.colors,
main = "significant genes"
)
ggsave("./plots/03_euler.svg", p)
## Saving 7 x 5 in image
ggsave("./plots/03_euler.png", p)
## Saving 7 x 5 in image
p
# 2nd option: venn two tissues
library(eulerr)
p <- plot(
eulerr::venn(gene_sets),
fills = params$tissue_pal,
main = "significant genes"
)
ggsave("./plots/03_venn.svg", p)
## Saving 7 x 5 in image
ggsave("./plots/03_venn.png", p)
## Saving 7 x 5 in image
p
Replacing the raw counts with normalized counts for better visual representation of the performed DEA:
counts.gastroc <- counts(dds.gastroc, normalized = T)
counts.soleus <- counts(dds.soleus, normalized = T)
zscore <- function(M) {
s <- apply( M,1,sd ) # standard deviation
µ <- apply( M, 1, mean ) # mean
M.z <- (M - µ) / s # zscore
return(M.z)
}
# obtaining all DE genes, but avoiding duplicates
sign_genes <-
unique(c(
row.names(gastroc_res.filtered),
row.names(soleus_res.filtered)
))
# making sure, that DE genes occur in both tissues
sign_genes <-
sign_genes[sign_genes %in% rownames(counts.gastroc) &
sign_genes %in% rownames(counts.soleus)]
counts_sign <- merge(
counts.gastroc[sign_genes,],
counts.soleus[sign_genes,],
by = 0,
suffixes = c("_gastrocnemius", "_soleus")
) %>%
tibble::column_to_rownames(var="Row.names") %>%
as.matrix()
plotCHM <- function(counts.df) {
# http://colorbrewer2.org/#type=sequential&scheme=BuGn&n=3
morecols <- colorRampPalette(params$heatmap_pal)
tissue_vec <- c(rep("gastroc", 12), rep("soleus", 12))
condition_vec <- c(rep(c(rep("WT", 6), rep("KO", 6)),2))
top_annot <-
HeatmapAnnotation(
tissue = tissue_vec,
condition = condition_vec,
col = list(tissue = params$tissue_pal, condition = params$condition_pal),
gp = gpar(col = "darkgray"),
show_legend = FALSE
)
## sample numbers
sample_nrs <- colnames(counts.df) %>%
stringr::str_extract("(?<=_)\\d{1,2}(?=_)")
colnames(counts.df) = sample_nrs
chm <- Heatmap(
counts.df,
row_title = "differentially expressed genes",
name = "zscore",
show_row_names = FALSE,
show_column_names = TRUE,
column_title = "samples",
col = morecols(50),
column_title_side = "bottom",
top_annotation = top_annot,
column_names_gp = gpar(col = rep(unname(params$tissue_pal), each=12))
)
# creating custom annotation legend (to obtain the gray border)
condition_legend = Legend(
labels = c("WT", "KO"),
legend_gp = gpar(fill = params$condition_pal),
border = "darkgray",
title = "condition"
)
tissue_legend = Legend(
labels = c("gastrocnemius", "soleus"),
legend_gp = gpar(fill = unname(params$tissue_pal)),
border = "darkgray",
title = "tissue"
)
legend_list <- list(condition_legend, tissue_legend)
draw(chm, annotation_legend_list = legend_list)
}
chm <- plotCHM(zscore(counts_sign))
svglite::svglite("./plots/03_heatmap_sign_genes.svg", width=6, height=6)
chm
dev.off()
png("./plots/03_heatmap_sign_genes.png", width=6, height=6, units = "in", res = 1200)
chm
dev.off()
## quartz_off_screen
## 2
## quartz_off_screen
## 2
PCA is applied to the same significantly DE genes (normalized) as used in the heatmap.
pca <- prcomp(t(counts_sign), scale. = T)
pca.data <- data.frame(Sample = rownames(pca$x),
X = pca$x[, 1],
Y = pca$x[, 2]) %>%
mutate(condition = substr(Sample, 1, 2),
tissue = stringr::str_extract(Sample,"[:alpha:]+$"),
sample_nr = stringr::str_extract(Sample, "(?<=_)\\d{1,2}(?=_)"))
p <-
autoplot(
pca,
data = pca.data,
colour = 'tissue',
shape = 'condition',
label.label = "sample_nr",
label = TRUE,
label.show.legend = FALSE,
label.repel = TRUE
) +
# ggtitle("PCA significantly DE genes") +
scale_color_manual(values = unname(params$tissue_pal)) +
theme_bw()
ggsave(filename = "plots/02_pca_dea_sign.svg", p)
## Saving 7 x 5 in image
## Warning: ggrepel: 2 unlabeled data points (too many overlaps). Consider increasing max.overlaps
ggsave(filename = "plots/02_pca_dea_sign.png", p)
## Saving 7 x 5 in image
## Warning: ggrepel: 2 unlabeled data points (too many overlaps). Consider increasing max.overlaps
p
## Warning: ggrepel: 2 unlabeled data points (too many overlaps). Consider increasing max.overlaps
save(dds.gastroc, dds.soleus, res.gastroc, res.soleus, file = "./data/Robjects/03_DDS.RData")